Qualitative Analysis and Machine Learning
Photo by Towfiqu barbhuiya on Unsplash
You can make your heart younger by making changes that reduce your risk…
— Center for Disease Control
# Load csv data file
df = read.csv("archetypes/heart.csv", header = TRUE, stringsAsFactors = FALSE)
df
data <- df
colnames(data) <- c("age", "sex", "chest_pain", "blood_pressure", "cholestoral", "blood_sugar", "ecg", "heart_rate", "angina", "ST_induced_exercise", "ST_slope", "fluroscopy", "thal", "target")
summary(data)
## age sex chest_pain blood_pressure
## Min. :29.00 Min. :0.0000 Min. :0.000 Min. : 94.0
## 1st Qu.:47.50 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:120.0
## Median :55.00 Median :1.0000 Median :1.000 Median :130.0
## Mean :54.37 Mean :0.6832 Mean :0.967 Mean :131.6
## 3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:140.0
## Max. :77.00 Max. :1.0000 Max. :3.000 Max. :200.0
## cholestoral blood_sugar ecg heart_rate
## Min. :126.0 Min. :0.0000 Min. :0.0000 Min. : 71.0
## 1st Qu.:211.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:133.5
## Median :240.0 Median :0.0000 Median :1.0000 Median :153.0
## Mean :246.3 Mean :0.1485 Mean :0.5281 Mean :149.6
## 3rd Qu.:274.5 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:166.0
## Max. :564.0 Max. :1.0000 Max. :2.0000 Max. :202.0
## angina ST_induced_exercise ST_slope fluroscopy
## Min. :0.0000 Min. :0.00 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.80 Median :1.000 Median :0.0000
## Mean :0.3267 Mean :1.04 Mean :1.399 Mean :0.7294
## 3rd Qu.:1.0000 3rd Qu.:1.60 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.20 Max. :2.000 Max. :4.0000
## thal target
## Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:0.0000
## Median :2.000 Median :1.0000
## Mean :2.314 Mean :0.5446
## 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :3.000 Max. :1.0000
cat_cols <- c("sex", "chest_pain", "blood_sugar", "ecg", "angina", "ST_slope", "thal", "target")
data[cat_cols] <- lapply(data[cat_cols], factor)
levels(data$sex) <- c("Female", "Male")
levels(data$chest_pain) <- c("Asymptomatic", "Atypical angina", "No angina", "Typical angina")
levels(data$blood_sugar) <- c("> 120mg/dl", "< 120mg/dl")
levels(data$ecg) <- c("Hypertrophy", "Normal", "Abnormalities")
levels(data$angina) <- c("No", "Yes")
levels(data$ST_slope) <- c("Descending", "Flat", "Ascending")
levels(data$thal) <- c("Error", "Fixed defect", "Normal flow", "Reversible defect")
levels(data$target) <- c("Yes", "No")
head(data)
v0<-ggplot(data, aes(target, fill=target)) +
geom_bar() +
labs(x="Disease", y="Number of patients", title = "Number of Patients with Heart Diseases") +
guides(fill=FALSE)
girafe(ggobj = v0, width_svg = 720/72, height_svg = 405/72, options =
list(opts_sizing(rescale = TRUE, width = 1.0)))
v1 <- ggplot(data, aes(age, fill=target)) +
geom_histogram(binwidth=1) +
labs(fill="Disease", x="Age", y="Number of patients", title = "Relationship between Age and Heart Disease")
v2 <- ggplot(data, aes(sex, fill=target)) +
geom_bar() +
labs(fill="Disease", x="Sex", y="Number of patients", title = "Relationship between Sex and Heart Disease")
v3 <- ggplot(data, aes(chest_pain, fill=target)) +
geom_bar() +
labs(fill="Disease", x="Chest pain type", y="Number of patients", title = "Chest Pain and Heart Disease")
v4 <- ggplot(data, aes(blood_pressure, fill=target)) +
geom_histogram(binwidth=3) +
labs(fill="Disease", x="Blood pressure (mm Hg)", y="Number of patients", title = "Blood Pressure and Heart Disease")
v5 <- ggplot(data, aes(cholestoral, fill=target)) +
geom_histogram(binwidth=10) +
labs(fill="Disease", x="Cholesterol (mg/dl)", y="Number of patients", title = "Cholesterol and Heart Disease")
v6 <- ggplot(data, aes(blood_sugar, fill=target)) +
geom_bar() +
labs(fill="Disease", x="Sugar levels", y="Number of patients", title = "Blood Sugar and Heart Disease")
v7 <- ggplot(data, aes(ecg, fill=target)) +
geom_bar() +
labs(fill="Disease", x="Electrocardiogram on rest", y="Number of patients", title = "Resting ECG and Heart Disease")
v8 <- ggplot(data, aes(heart_rate, fill=target)) +
geom_histogram(binwidth=10) +
labs(fill="Disease", x="Maximum heart rate during exercise", y="Number of patients", title = "Heart Rate and Heart Disease")
v9 <- ggplot(data, aes(angina, fill=target)) +
geom_bar() +
labs(fill="Disease", x="Presence of angina during exercise", y="Number of patients", title = "Angina and Heart Disease")
v10 <- ggplot(data, aes(ST_slope, fill=target)) +
geom_bar() +
labs(fill="Disease", x="Slope of the ST segment", y="Number of patients", title = "ST_slope and Heart Disease")
v11 <- ggplot(data, aes(fluroscopy, fill=target)) +
geom_bar() +
labs(fill="Disease", x="Number of main blood vessels coloured", y="Number of patients", title = "Fluroscopy and Heart Disease")
v12 <- ggplot(data, aes(thal, fill=target)) +
geom_bar() +
labs(fill="Disease", x="Results of the blood flow", y="Number of patients", title = "Blood Flow and Heart Disease")
patch = v1 + v2 + v3 + v4 + v5 + v6 + v7 + v8 + v9 + v10 + v11 + v12 + plot_layout(ncol=4, nrow=3)
#patch
girafe(ggobj = patch, width_svg = 1280/72, height_svg = 720/72, options =
list(opts_sizing(rescale = TRUE, width = 1.0)))
set.seed(213)
library(caret)
df$target <- as.factor(df$target)
hd_index <- createDataPartition(df$target, p = .75, list = FALSE)
hd_train <- df[ hd_index, ]
hd_test <- df[-hd_index, ]
fitControl <- trainControl(method="cv", number=5, savePredictions = "final")
set.seed(8)
lr <- train(target ~ .,
data = hd_train,
method = "glm",
family=binomial(),
trControl = fitControl)
lr
## Generalized Linear Model
##
## 228 samples
## 13 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 182, 182, 182, 182, 184
## Resampling results:
##
## Accuracy Kappa
## 0.812253 0.6173789
importance = varImp(lr)
PlotImportance = function(importance)
{
varImportance <- data.frame(Variables = row.names(importance[[1]]),
Importance = round(importance[[1]]$Overall,2))
# Create a rank variable based on importance
rankImportance <- varImportance %>%
mutate(Rank = paste0('#',dense_rank(desc(Importance))))
rankImportancefull = rankImportance
ggplot(rankImportance, aes(x = reorder(Variables, Importance),
y = Importance)) +
geom_bar(stat='identity',colour="white", fill = "#2185DC") +
geom_text(aes(x = Variables, y = 1, label = Rank),
hjust=0, vjust=.5, size = 4, colour = 'black',
fontface = 'bold') +
labs(x = 'Variables', title = 'Relative Variable Importance') +
coord_flip() +
theme_bw()
}
v20 <- PlotImportance(importance)
girafe(ggobj = v20, width_svg = 1280/72, height_svg = 720/72, options =
list(opts_sizing(rescale = TRUE, width = 1.0)))
library(caretEnsemble)
algorithmList <- c('rf', 'nb', 'knn', 'ada', 'svmRadial')
models <- caretList(target ~ .,
data=df,
trControl=fitControl,
methodList=algorithmList,
tuneList = NULL,
preProcess = c("center","scale"))
results <- resamples(models)
#dotplot(results, metric = "Accuracy")
#dotplot(results, metric = "Kappa")
models$ada
## Boosted Classification Trees
##
## 303 samples
## 13 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 243, 242, 243, 242, 242
## Resampling results across tuning parameters:
##
## maxdepth iter Accuracy Kappa
## 1 50 0.8280874 0.6475908
## 1 100 0.8379781 0.6689036
## 1 150 0.8577596 0.7093393
## 2 50 0.8380328 0.6692907
## 2 100 0.8346995 0.6617045
## 2 150 0.8346448 0.6620925
## 3 50 0.8215301 0.6348308
## 3 100 0.8346448 0.6617884
## 3 150 0.8214208 0.6354736
##
## Tuning parameter 'nu' was held constant at a value of 0.1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were iter = 150, maxdepth = 1 and nu = 0.1.
model_results <- data.frame(
Random_Forest = c(mean(models$rf$results$Accuracy),mean(models$rf$results$Kappa)) ,
Naive_Bayes = c(mean(models$nb$results$Accuracy),mean(models$nb$results$Kappa)),
KNN = c(mean(models$knn$results$Accuracy),mean(models$knn$results$Kappa)),
Decision_Tree = c(mean(models$ada$results$Accuracy),mean(models$ada$results$Kappa)),
SVM = c(mean(models$svmRadial$results$Accuracy), mean(models$svmRadial$results$Kappa)),
Logistic_Regress = c(mean(lr$results$Accuracy),mean(lr$results$Kappa))
)
row.names(model_results) <- c("Accuracy", "Kappa")
model_results <- as.data.frame(t(model_results))
model_results <- cbind(methods = rownames(model_results), model_results)
rownames(model_results)<-NULL
model_results
v21 <- ggplot(model_results, aes(x=Accuracy, y=Kappa, color = methods))+
geom_point(size=4)
girafe(ggobj = v21, width_svg = 1280/72, height_svg = 720/72, options =
list(opts_sizing(rescale = TRUE, width = 1.0)))
pred_rf <- predict.train(models$rf, hd_test)
pred_nb <- predict.train(models$nb, hd_test)
pred_knn <- predict.train(models$knn, hd_test)
pred_ada <- predict.train(models$ada, hd_test)
pred_svm <- predict.train(models$svmRadial, hd_test)
pred_lr <- predict(lr, hd_test)
confusionMatrix(pred_ada, hd_test$target, mode = "prec_recall")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 27 3
## 1 7 38
##
## Accuracy : 0.8667
## 95% CI : (0.7684, 0.9342)
## No Information Rate : 0.5467
## P-Value [Acc > NIR] : 0.000000003311
##
## Kappa : 0.7283
##
## Mcnemar's Test P-Value : 0.3428
##
## Precision : 0.9000
## Recall : 0.7941
## F1 : 0.8438
## Prevalence : 0.4533
## Detection Rate : 0.3600
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.8605
##
## 'Positive' Class : 0
##
plotConfusion <- function(pred) {
z <- confusionMatrix(pred, hd_test$target, mode = "prec_recall")$table
True <- factor(c(0, 0, 1, 1))
Predicted <- factor(c(0, 1, 0, 1))
Y <- as.numeric(z)
conf <- data.frame(True, Predicted, Y)
ggplot(data = conf, mapping = aes(x = True, y = Predicted)) +
geom_tile(aes(fill = Y), colour = "white") +
geom_text(aes(label = sprintf("%1.0f", Y)), vjust = 1) +
scale_fill_gradient(low = "pink", high = "cyan") +
theme_bw() + theme(legend.position = "none")
}
conf_nb <- plotConfusion(pred_nb)+ggtitle('Naive Bayes')
conf_knn <- plotConfusion(pred_knn)+ggtitle('K Nearest Neighbors')
conf_rf <- plotConfusion(pred_rf)+ggtitle('Random Forest')
conf_ada <- plotConfusion(pred_ada)+ggtitle('Decision Trees')
conf_svm <- plotConfusion(pred_svm)+ggtitle('Support Vector Machines')
conf_lr <- plotConfusion(pred_lr)+ggtitle('Logistic Regression')
patch2 = conf_rf + conf_nb + conf_knn + conf_ada+ conf_svm + conf_lr + plot_layout(ncol=3, nrow=2)
#patch
girafe(ggobj = patch2, width_svg = 1280/72, height_svg = 720/72, options =
list(opts_sizing(rescale = TRUE, width = 1.0)))